# contributions:
#     corona: Jake acquired the data set & wrote the paragraph text, Will wrote
#             wrangling & graph code. Since this was our story baseline, it was
#             revised a few times afterwards by both of us. 
#     retail: Jake
#     flight: Will

pacman::p_load(tidyverse, lubridate)

#covid <- read_csv("https://covid.ourworldindata.org/data/owid-covid-data.csv")
covid <- read_csv("data/owid-covid-data.csv")
## Rows: 133803 Columns: 67
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr   (4): iso_code, continent, location, tests_units
## dbl  (62): total_cases, new_cases, new_cases_smoothed, total_deaths, new_dea...
## date  (1): date
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
covid$date <- as_date(covid$date)

# rename things from covid to match map_data("world")
covid$location[covid$location == "United States"] <- "USA"
covid$location[covid$location == "United Kingdom"] <- "UK"
covid$location[covid$location == "Czechia"] <- "Czech Republic"
covid$location[covid$location == "Democratic Republic of Congo"] <-
    "Democratic Republic of the Congo"
covid$location[covid$location == "Congo"] <-
    "Republic of Congo"
covid$location[covid$location == "Cote d'Ivoire"] <- "Ivory Coast"

Corona

Corona has hit the entire world, but at drastically different levels. Some industries faced massive decline with no recovery in sight, while others have been virtually immune. The release of new vaccines has brought some hope to these industries, but how helpful are they really? We’re going to do a comparison with a few key variables.

Although our dataset gives us many variables to start from, what seems to be closest to what we want is “total_cases_per_million”, which by viewing from a few countries over time seems to only increase. This means we can use the latest total for each country as a rough number of how many cases they’ve had overall.

covid %>% select("location", "date", "total_cases_per_million") %>% 
    filter(location == c("USA", "Germany", "Japan", "Brazil", "India")) %>% 
    group_by(location) %>%
    ggplot(mapping = aes(x = date, y = total_cases_per_million)) +
    geom_line(mapping = aes(color = location)) +
    labs(title = "total corona cases per million")

This means we can use the latest total for each country as a rough number of how many cases they’ve had overall. The dataset is updated regularly, but for the purposes of this project I will use Oct. 31st of this year as the most recent date so we will filter by that & see how many total corona cases we have on a world map. In a way, we can view this as a metric of how heavy corona has hit a country.

totals <- covid %>% select("location", "date", "total_cases_per_million") %>%
    filter(month(date) == 10 & day(date) == 31 & year(date) == 2021)
map_data("world") %>%
    rename(location = region) %>% 
    left_join(totals, by = "location") %>% 
    ggplot(mapping = aes(x = long, y = lat, group = group,
                         fill = total_cases_per_million)) +
    geom_polygon(color = "black") +
    coord_map(xlim = c(-180, 180), ylim=c(-60, 90)) +
    scale_fill_distiller(palette = "Reds", trans = "reverse") +
    labs(x = element_blank(), y = element_blank()) +
    guides(fill = guide_legend(
        title = "Total cases per\nmillion as of\nOct. 31st, 2021"))

Next we will pick 3 industries to graph the decline of. Although any industry can be measured in its own numerous ways, the important thing is that we have a number from before corona (normal) & after corona (impacted). We can use the worst number, the highest amount of decline, to measure how badly this industry was impacted. The dates of these minimums will also be in a scatter plot to see how useful the data is.

1. Retail

The industry I researched was retail and recreation. This includes restaurants, stores, malls, and other places of general recreation, such as movie theaters, bowling alleys, and sporting events. Retail & recreation was best measured on a day to day basis. Looking at this industry will give an important glimpse into the effects of COVID-19 and how it changed the everyday lives of people. Retail & recreation was best measured on a day to day basis.

After creating the data frame from the retail and recreation statistics, I selected 11 countries to take a closer look at. These countries, The United States, Germany, Japan, Brazil, India, & Australia are used to most closely represent the entire world. These countries were chosen because they span a variety of economic rankings and geographical locations. There are representatives from each continent other than Antarctica, and economic standings from the very top to the bottom of the world.

retail <- read.csv("data/retail.csv")
retail$Day <- ymd(retail$Day) %>% 
  as_date()
selectedCountries <- c("United States", "Germany", "Japan", "Brazil", 
                       "India", "Australia")

retail_filter <- retail %>%
  filter(Entity == selectedCountries) %>%
  select(c('Entity', 'Day', 'retail_and_recreation'))

The retail and recreation data frame consists of each individual day from February 17th, 2020 to present day. A separate column calculates the percent change in visitors on each of these days in a given country. The resulting graph, plotted with date on the x-axis and percent change on the y, shows how a countries retail and recreation industry has fluctuated over the last year plus.

retail_filter %>%
  ggplot(mapping = aes(x = Day, y = retail_and_recreation)) +
  geom_line(mapping = aes(group = Entity, color = Entity)) +
  ggtitle("Retail and Recreation Day to Day") +
  theme(axis.title.y=element_blank(),
        axis.title.x=element_blank())

retail_filter %>% 
    filter(year(Day) == 2020) %>% 
    group_by(month(Day)) %>% 
    ggplot(mapping = aes(x = Day, y = retail_and_recreation)) +
    geom_line(mapping = aes(group = Entity, color = Entity)) +
    ggtitle("Retail and Recreation by 2020 months") +
    theme(axis.title.y=element_blank(),
          axis.title.x=element_blank()) +
    facet_wrap(~month(Day), scales = "free_x")

retail_filter %>% 
    filter(year(Day) == 2021) %>% 
    group_by(month(Day)) %>% 
    ggplot(mapping = aes(x = Day, y = retail_and_recreation)) +
    geom_line(mapping = aes(group = Entity, color = Entity)) +
    ggtitle("Retail and Recreation by 2021 months") +
    theme(axis.title.y=element_blank(),
          axis.title.x=element_blank()) +
    facet_wrap(~month(Day), scales = "free_x")

Overall, we can see the first large dip in industry success around March of 2020. This makes sense, as the first worldwide global shutdown due to the pandemic was in this month. People were staying home, and not giving the necessary business to their favorite stores and restaurants. This led to many businesses, especially smaller ones, closing. The Wall Street Journal says there was up to 200,000 extra business closures in the pandemic’s first year alone. The pandemic destroyed the lives of many business owners, and this plot shows that beginning in March. Since March, the general trend of the industry has been positive, with some obvious fluctuation along the way. To show what part of this fluctuation resulted naturally and which part was the fault of COVID-19, we graphed the number of corona cases per million since last February. With general spikes between December of 2020 and January of 2021, March and April of 2021, and as recently as August and September of this year,we can see the ups and downs of COVID cases. To discover how these numbers affected the retail and recreation industry, and eventually if the vaccine helped, I took a further look into three countries; The States, Brazil, and India. The choice of these three countries was driven by GDP per capita - GDP divided by the number of residents of a country. Purchasing power parity is GDP per capita that also taking into account inflation rates and the cost of local goods and services to get a more accurate picture of a nation’s average standard of living. By looking at a country’s average living standards, we can get a good idea of how much the everyday person can afford to partake in retail and recreation activities.

pacman::p_load(kableExtra)
ppp <- read_csv("data/ppp.csv")

ppp %>%
  kbl(caption = "Purchasing Power Parity Around the World", booktabs = T) %>%
  kable_styling(bootstrap_options = 'striped') %>%
  column_spec(2, color = 'green') %>% 
    column_spec(1, color = "black") %>% 
    column_spec(3, color = "black") %>% 
    row_spec(0, color  = "black")
Purchasing Power Parity Around the World
Country GDP-$PPP World Ranking
United States 63146 6
Germany 54076 19
Australia 51680 20
France 46062 26
Japan 42248 30
Mexico 19130 72
Brazil 14916 85
Ukraine 13110 95
India 6461 128
Nigeria 5187 142

Shown in the table are the PPP’s of the 11 countries listed above. After choosing the United States, who comes in at 6th at 63,416, Brazil, 85th at 14,916, and India, 128th at 6,461, we see a top 10 country, a middle of the road country, and a relatively poor country. Most countries more poor than India do not have sufficient vaccination data to successfully study. While many Middle Eastern and African countries have lower GDPs per capita, they simply do not have well documented vaccination statistics. Still, with these three countries, we will be able to see how corona numbers affected industries with varying economic status.

selectedCountries_us <- c("United States")
selectedCountries_brazil <- c("Brazil")
selectedCountries_india <- c("India")

retail_filter_us <- retail_filter %>%
  filter(Entity == selectedCountries_us)
retail_wide_us <- retail_filter_us %>%
  pivot_wider(names_from = Entity, values_from = retail_and_recreation)

retail_filter_brazil <- retail_filter %>%
  filter(Entity == selectedCountries_brazil)
retail_wide_brazil <- retail_filter_brazil %>%
  pivot_wider(names_from = Entity, values_from = retail_and_recreation)

retail_filter_india <- retail_filter %>%
  filter(Entity == selectedCountries_india)
retail_wide_india <- retail_filter_india %>%
  pivot_wider(names_from = Entity, values_from = retail_and_recreation)

After individually graphing the US, Brazil, and India’s industry percentages, corona cases and vaccine rates, we can see how corona really affected each countries retail and recreation.

retail_wide_us %>%
  ggplot() +
  geom_bar(mapping = aes(x = Day, y = `United States`, fill = Day), 
           stat = "identity") +
  ggtitle("US Retail and Recreation by Day") +
  theme(axis.title.y=element_blank(),
        axis.title.x=element_blank(),
        legend.position = "none") +
  scale_fill_viridis_c(option = "magma")

covid %>% select("location", "date", "new_cases_smoothed_per_million") %>% 
    filter(location == "USA") %>% 
  group_by(location) %>%
  ggplot(mapping = aes(x = date, y = new_cases_smoothed_per_million)) +
  geom_line(mapping = aes(color = location)) +
  labs(title = "US Corona Cases Per Million")

covid %>% select("location", "date", "people_fully_vaccinated_per_hundred") %>% 
    filter(location == "USA") %>% 
  group_by(location) %>%
  ggplot(mapping = aes(x = date, y = people_fully_vaccinated_per_hundred)) +
  geom_line(mapping = aes(color = location)) +
  labs(title = "US Vaccination Percentage")

As we can see, there is a direct correlation between rising corona cases and falling industry success. At the beginning of the global shutdown in March, we see the percentage of visitors on a daily basis plummet near negative 50%. At the same time, corona cases in the United States begin to rise. The point where the correlation really sticks out is around October of 2020. Just before this time, the retail and recreation industry was slowly beginning to rise towards an average amount of visitors. Then, in October, we can see a huge peak in corona cases. Right after this outbreak, the industry begins to crash again, all the way back down near negative 30%. Once this peak starts to fall, we can see the industry slowly start to recover again. This definitely shows a relation between corona cases and the success of the retail and recreation industry in the United States - the more cases there are, the more stores and restaurants get shut down, the more people stay home, and the more the industry suffers. We can also see a possible effect of the vaccine in the plots. Around September of 2021, we see another outbreak of corona cases in the US with a peak nearing the one of October 2020. But, we do not see the retail and recreation industry struggle and collapse like before. A possible reason? The increase in percent of people in the US with the vaccine. More people with the vaccine means less people are scared to leave their home, even amid a case outbreak. Alongside this, places of retail, including big events like concerts and sporting events, were more likely to stay open with vaccination numbers rising. Some places and events even required vaccination to enter, but regardless it is clear they were far more likely to stay open with vaccine numbers rising.

retail_wide_brazil %>%
  ggplot() +
  geom_bar(mapping = aes(x = Day, y = Brazil, fill = Day), 
           stat = "identity") +
  ggtitle("Brazil Retail and Recreation by Day") +
  theme(axis.title.y=element_blank(),
        axis.title.x=element_blank(),
        legend.position = "none") +
  scale_fill_viridis_c(option = "magma")

covid %>% select("location", "date", "new_cases_smoothed_per_million") %>% 
  filter(location == selectedCountries_brazil) %>% 
  group_by(location) %>%
  ggplot(mapping = aes(x = date, y = new_cases_smoothed_per_million)) +
  geom_line(mapping = aes(color = location)) +
  labs(title = "Brazil Corona Cases Per Million")

covid %>% select("location", "date", "people_fully_vaccinated_per_hundred") %>% 
  filter(location == selectedCountries_brazil) %>% 
  group_by(location) %>%
  ggplot(mapping = aes(x = date, y = people_fully_vaccinated_per_hundred)) +
  geom_line(mapping = aes(color = location)) +
  labs(title = "Brazil Vaccination Percentage")

Looking at Brazil, a country with far less GDP per capita, we see a very similar story. Around March of 2020, the industry crashes the same time that the first global lock down occurs and corona cases rise across the country. One difference between Brazil and the United States is that the lowest dip we see mostly hangs around the high 30s opposed to the high 40s in the US. But, similarly to the US, as the original high point of cases begins to slope downwards, we see the industry start to recover, rising back near a 15 percent drop. Then, the peak of corona cases hits Brazil between March andApril of 2021. When this happens, the industry crashes again as expected, even though not as badly. The interesting section of Brazil’s data is how rapidly the cases slope downwards come July of 2021. Around this time, we see the vaccine number rise, which is the cause of the drop of cases. With the vaccine helping cases drop, Brazilians began to hit stores, restaurants, and public events again, proven by how the percentage of visitors rises back near their original average.

retail_wide_india %>%
  ggplot() +
  geom_bar(mapping = aes(x = Day, y = India, fill = Day),
           stat = "identity") +
  ggtitle("India Retail and Recreation by Day") +
  theme(axis.title.y=element_blank(),
        axis.title.x=element_blank(), 
        legend.position = "none") +
  scale_fill_viridis_c(option = "magma")

covid %>% select("location", "date", "new_cases_smoothed_per_million") %>% 
  filter(location == selectedCountries_india) %>% 
  group_by(location) %>%
  ggplot(mapping = aes(x = date, y = new_cases_smoothed_per_million)) +
  geom_line(mapping = aes(color = location)) +
  labs(title = "India Corona Cases Per Million")

covid %>% select("location", "date", "people_fully_vaccinated_per_hundred") %>% 
  filter(location == selectedCountries_india) %>% 
  group_by(location) %>%
  ggplot(mapping = aes(x = date, y = people_fully_vaccinated_per_hundred)) +
  geom_line(mapping = aes(color = location)) +
  labs(title = "India Vaccination Percentage")

India brought a whole different story to the table. When March came around, the country’s cases stayed relatively low. The reason for this is shown by the industry’s visitor percentage. Around this time, the retail and recreation industry dropped below a negative 80 percent loss of customers - double that of the United States and Brazil. This shows how serious the original lock down was in India, as the industry nearly disappeared completely. After the industry started to reopen and approach its original point, a corona outbreak, India’s largest by far, occurred in May 2021. After this, the country appeared to go into full lock down again, with the industry dropping below negative 60 percent visitor loss. Once India’s vaccine levels started to really rise in July, the peak dropped off and the industry started to recover.

Overall, we can see similarities and differences in the ways that a high, middle, and low GDP per capita country were affected by covid. The United States had an easier time recovering the retail and recreation industry, while only dealing with two corona peaks. Brazil had more ups and down in their corona cases, but the vaccine seemed to help them out more than the US. On the poor end, it seems countries are far more likely to go into a strict lock down like India did, nearly erasing their retail and recreation industry. This helped them deal with only one large corona outbreak, which appears to have been significantly helped by the release of the vaccine.

Corona affected both rich and poor countries, high and low population countries, and countries in varying geographic areas. It did not care what was in its path, and one of the things it destroyed across the world was the retail and recreation industry. The vaccine has aided in the return of this industry, attempting to open up places of business return their owners to where they belong.

2. Airlines

Airline travel basically came to a halt. Fortunately for our purposes, we can simply use the total number of flights a country has had. Let’s 1st graph the entire world in terms of cases & flights.

flights <- read_csv("data/flights.csv") # "flights.r" creates this csv
flights$flightCount <- as.numeric(flights$flightCount)
flights$num <- as.numeric(flights$num)
flights$date <- ymd(flights$date) %>% as_date()
# to match map_data("world")
flights$location[flights$location == "United States of America (the)"] <- "USA"
flights$location[flights$location ==
    "United Kingdom of Great Britain and Northern Ireland (the)"] <- "UK"
flights$location[flights$location == "Russian Federation (the)"] <- "Russia"
flights$location[flights$location == "Korea (the Republic of)"] <- "South Korea"
flights$location[flights$location == 
    "Bolivia (Plurinational State of)"] <- "Bolivia"
flights$location[flights$location == "Viet Nam"] <- "Vietnam"
flights$location[flights$location ==
    "Central African Republic (the)"] <- "Central African Republic"
flights$location[flights$location == 
    "Congo (the Democratic Republic of the)"] <-
    "Democratic Republic of the Congo"
flights$location[flights$location == "Congo (the)"] <- "Republic of Congo"
flights$location[flights$location ==
    "Dominican Republic (the)"] <- "Dominican Republic"
flights$location[flights$location == "Gambia (the)"] <- "Gambia"
flights$location[flights$location == "Iran (Islamic Republic of)"] <- "Iran"
flights$location[flights$location ==
    "Korea (the Democratic People's Republic of)"] <- "North Korea"
flights$location[flights$location ==
    "Lao People's Democratic Republic (the)"] <- "Laos"
flights$location[flights$location == "Moldova (the Republic of)"] <- "Moldova"
flights$location[flights$location == "Côte d'Ivoire"] <- "Ivory Coast"
flights$location[flights$location == "Philippines (the)"] <- "Philippines"
flights$location[flights$location == "Sudan (the)"] <- "Sudan"
flights$location[flights$location == "Taiwan (Province of China)"] <- "Taiwan"
flights$location[flights$location ==
    "Tanzania, United Republic of"] <- "Tanzania"
flights$location[flights$location ==
    "United Arab Emirates (the)"] <- "United Arab Emirates"
flights$location[flights$location ==
    "Venezuela (Bolivarian Republic of)"] <- "Venezuela"
flights$location[flights$location == "Czechia"] <- "Czech Republic"
flights$location[flights$location == "Syrian Arab Republic"] <- "Syria"
flights$location[flights$location == "Niger (the)"] <- "Niger"
flights$location[flights$location == "Netherlands (the)"] <- "Netherlands"

flightsGlobal <- setNames(
    data.frame(matrix(ncol = 2, nrow = 0)), c("date", "flightCount"))
dateRange <- seq(as.Date("2020-01-01"), as.Date("2021-10-31"), by = "days")
for (i in 1:length(dateRange)) {
    tempDay <- filter(flights, date == dateRange[i])
    tempTotal <- sum(tempDay$flightCount)
    flightsGlobal <- rbind(flightsGlobal, data.frame(
        date = dateRange[i], flightCount = tempTotal))
}
rm(i, tempTotal)

casesGlobal <- covid %>%
    filter(location == "World") %>% 
    select(date, new_cases_smoothed_per_million) %>% 
    na.omit()

# acquire a constant to map the "middle" of one vector to another. This will be
# used for dual line graphs with different y value ranges
scaler <- function(secondAxis, firstAxis) {
    (min(secondAxis) + max(secondAxis)) / (min(firstAxis) + max(firstAxis))
}

scale <- scaler(flightsGlobal$flightCount, casesGlobal$new_cases_smoothed_per_million)
inner_join(flightsGlobal, casesGlobal, by = "date") %>% 
    ggplot(aes(x = date)) +
    geom_line(aes(y = flightCount / scale), color = "blue") +
    geom_line(aes(y = new_cases_smoothed_per_million), color = "red") +
    scale_y_continuous(
        name = "new cases", sec.axis = sec_axis(~.*scale, name = "flights")) +
    theme(axis.title.y = element_text(color = "red"),
          axis.title.y.right = element_text(color = "blue")) +
    ggtitle("Global new cases per million vs. flights per day")

As you can see, it doesn’t seem to line up too well. We do have 1 significant drop from March to July 2020, with only the 2nd & 3rd case surges hindering growth. However, flights were shut down as soon as corona started, & the industry has been very careful about opening back up, as the business is very risky. Also, things could be very different depending on the country. Let’s look at a few specific countries to see what the data really tells.

population <- read_csv("data/WPP2019_TotalPopulationBySex.csv") %>% 
    filter(Time == 2020) %>% # now
    select(LocID, Location, PopTotal) %>% # dont need extra stuff
    unique() %>% # collapse other values idk what they meant
    mutate(PopTotal = PopTotal * 1000) %>% # population is in 1000s idk why
    rename(num = LocID) %>%
    rename(location = Location) %>%
    rename(pop = PopTotal)
flightsPop <- inner_join(flights, population, by = "num") %>% 
    select(-location.y) %>% 
    rename(location = location.x)

focus <-  c("USA", "Germany", "Japan", "Brazil", "India", "Australia")

flightsPop %>%
    filter(location == focus) %>% 
    group_by(location) %>%
    ggplot(mapping = aes(x = date, y = flightCount / pop)) +
    geom_line(mapping = aes(color = location)) +
    labs(title = "# of flights per capita")

The flights between countries of similar flight counts per capita also seems to vary. So to get a better story, we’ll need to look at some individual countries.

for (country in focus) {
    flightsTemp <- flights %>% filter(location == country) %>%
        select(date, flightCount)
    casesTemp <- covid %>% filter(location == country) %>%
        select(date, new_cases_smoothed) %>% na.omit()
    
    scale <- scaler(flightsTemp$flightCount, casesTemp$new_cases_smoothed)

    print(inner_join(flightsTemp, casesTemp, by = "date") %>% 
        ggplot(aes(x = date)) +
        geom_line(aes(y = new_cases_smoothed), color = "red") +
        geom_line(aes(y = flightCount / scale), color = "blue") +
        scale_y_continuous(
            name = "new cases", sec.axis = sec_axis(~.*scale, name = "flights")) +
        theme(axis.title.y = element_text(color = "red"),
              axis.title.y.right = element_text(color = "blue")) +
        ggtitle(paste0(country, ", cases vs. flights")))
}

rm(flightsTemp, casesTemp)

As one can see, it is still a bit difficult to read. However, we can see that the initial outbreak caused a massive dip in every country, with each successive peak hindering recovery. Is there a way to compare how much a country’s airline industry was affected compared to another? We can take the pre-corona levels & the lowest point in that country’s air traffic, then compare it to the amount of total cases a country had by the end of our previous mentioned initial air traffic drop: July 2020. The formula would be

Although the number we get by itself doesn’t really mean anything, we can use it to compare every country on a map. Before we do that however, let’s make sure that these minimums & maximums are not too spread out time-wise, otherwise this statistic could be weakly founded.

everyCountry <- flights %>% 
    select(location) %>% 
    unique()
everyCountry <- everyCountry$location

flightImpact <- setNames(data.frame(matrix(ncol = 5, nrow = 0)),
    c("num", "location", "flights", "date", "extremity"))

for (country in everyCountry) {
    tempFlights <- flights %>% filter(location == country) %>%
        select(num, location, date, flightCount)
    max <- tempFlights[tempFlights$flightCount == max(tempFlights$flightCount),]
    min <- tempFlights[tempFlights$flightCount == min(tempFlights$flightCount),]
    
    flightImpact <- rbind(flightImpact, data.frame(
        num = tempFlights$num[1],
        location = country,
        flights = min$flightCount[1],
        date = min$date[1],
        extremity = "min"
    ))
    flightImpact <- rbind(flightImpact, data.frame(
        num = tempFlights$num[1],
        location = country,
        flights = max$flightCount[1],
        date = max$date[1],
        extremity = "max"
    ))
}

flightImpact %>% 
    ggplot(aes(x = date, fill = extremity)) +
    geom_histogram(binwidth = 4, alpha = 0.5, position = "identity") +
    scale_fill_manual(values=c("Red", "Blue"))

We seem to have quite a few outliers much later than expected, but this is because some countries such as Russia have had their airline industries recover already. However the rest of the dates bunch up all around the start, so our data is good here. To make sure we use the correct mins & maxes caused by the 1st wave of corona, let’s use everything before July 2020 to calculate our impact levels & map it.

flightImpact <- setNames(data.frame(matrix(ncol = 8, nrow = 0)),
    c("num", "iso", "location", "minFlights", "minDate", "maxFlights",
      "maxDate", "impact"))
for (country in everyCountry) {
    tempFlights <- flights %>%
        filter(location == country & date < as.Date("2020-07-01")) %>%
        select(num, iso, location, date, flightCount)
    max <- tempFlights[tempFlights$flightCount == max(tempFlights$flightCount),]
    min <- tempFlights[tempFlights$flightCount == min(tempFlights$flightCount),]
    
    flightImpact <- rbind(flightImpact, data.frame(
        num = tempFlights$num[1],
        iso = tempFlights$iso[1],
        location = country,
        minFlights = min$flightCount[1],
        minDate = min$date[1],
        maxFlights = max$flightCount[1],
        maxDate = max$date[1],
        impact = 100 - (100 * min$flightCount[1] / max$flightCount[1])
    ))
}

map_data("world") %>%
    rename(location = region) %>% 
    left_join(flightImpact, by = "location") %>% 
    ggplot(mapping = aes(x = long, y = lat, group = group, fill = impact)) +
    geom_polygon(color = "black") +
    coord_map(xlim = c(-180, 180), ylim=c(-60, 90)) +
    scale_fill_distiller(palette = "Reds", trans = "reverse") +
    labs(x = element_blank(), y = element_blank()) +
    guides(fill = guide_legend(title = "% drop in air traffic from corona"))

Ah North Korea, invincible as always. Other than that, no country is visibly below maybe 60%. So it’s safe to say that corona has affected most countries air traffic pretty heavily. Now let’s see if the vaccines are helping.

vaxxGlobal <- covid %>%
    filter(location == "World") %>% 
    select(date, people_fully_vaccinated_per_hundred) %>% 
    na.omit()

scale <- scaler(flightsGlobal$flightCount, vaxxGlobal$people_fully_vaccinated_per_hundred)
inner_join(flightsGlobal, vaxxGlobal, by = "date") %>% 
    ggplot(aes(x = date)) +
    geom_line(aes(y = flightCount / scale), color = "blue") +
    geom_line(aes(y = people_fully_vaccinated_per_hundred), color = "darkgreen") +
    scale_y_continuous(
        name = "vaccine rate", sec.axis = sec_axis(~.*scale, name = "flights")) +
    theme(axis.title.y = element_text(color = "darkgreen"),
          axis.title.y.right = element_text(color = "blue")) +
    ggtitle("% vaccinated vs. flights per day")

Is it working? I mean, maybe a little at the start. We have a massive spike in vaccines which seems to be completely ignored by the airline industry. Again, it might be better to look at other countries.

for (country in focus) {
    flightsTemp <- flights %>% filter(location == country) %>%
        select(date, flightCount)
    vaxxTemp <- covid %>% filter(location == country) %>%
        select(date, people_fully_vaccinated_per_hundred) %>% na.omit()
    
    scale <- scaler(flightsTemp$flightCount, vaxxTemp$people_fully_vaccinated_per_hundred)
    print(inner_join(flightsTemp, vaxxTemp, by = "date") %>% 
        ggplot(aes(x = date)) +
        geom_line(aes(y = people_fully_vaccinated_per_hundred), color = "darkgreen") +
        geom_line(aes(y = flightCount / scale), color = "blue") +
        ylim(0, 100) +
        scale_y_continuous(
            name = "vaccine rate",
            sec.axis = sec_axis(~.*scale, name = "flights")) +
        theme(axis.title.y = element_text(color = "darkgreen"),
              axis.title.y.right = element_text(color = "blue")) +
        ggtitle(paste0(country, ", vaccines vs. flights")))
}

rm(flightsTemp, vaxxTemp)

It does seem to help, except somehow Australia seems to decrease in flights. In spite of this based on these graphs we can assume that it is reasonably effective for the other countries. We can’t look at the graph of every single country in the world, so let’s try yielding a single number that shows the correlation between airline recovery & vaccine rate increases. The recovery of the airline industry can be derived in the following

We can also simply use the vaccine growth rate from March to now to also get a number to compare to. Mapping these correlations shows us how effective they are across the world.

# create empty columns
flightImpact["febFlights"] <- NA
flightImpact["currentFlights"] <- NA
flightImpact["febVaxx"] <- NA
flightImpact["currentVaxx"] <- NA
flightImpact["flightVaxxCorrelation"] <- NA
    
for (country in everyCountry) {
    # set febFlights
    tempFlights <- flights %>%
        filter(location == country & date == as.Date("2021-02-01")) %>%
        select(num, location, date, flightCount)
    flightImpact$febFlights[flightImpact$location == country] <-
        tempFlights$flightCount[1]
    # set currentFlights
    tempFlights <- flights %>%
        filter(location == country & date == as.Date("2021-10-31")) %>%
        select(num, location, date, flightCount)
    flightImpact$currentFlights[flightImpact$location == country] <-
        tempFlights$flightCount[1]
    
    tempVaxx <- covid %>%
        select(location, date, people_fully_vaccinated_per_hundred) %>% 
        filter(location == country & 
            (date == as.Date("2021-02-01") | date == as.Date("2021-10-31")))
    flightImpact$febVaxx[flightImpact$location == country] <- 
        tempVaxx$people_fully_vaccinated_per_hundred[1]
    flightImpact$currentVaxx[flightImpact$location == country] <- 
        tempVaxx$people_fully_vaccinated_per_hundred[2]
    
    # make correlation
    tempFlights <- flights %>%
        select(num, location, date, flightCount) %>% 
        filter(location == country &
            (date >= as.Date("2021-02-01") & date <= as.Date("2021-10-31")))
    tempFlights$flightCount <- 
        tempFlights$flightCount / max(tempFlights$flightCount)
    tempVaxx <- covid %>%
        select(location, date, people_fully_vaccinated_per_hundred) %>% 
        filter(location == country & 
            (date >= as.Date("2021-02-01") & date <= as.Date("2021-10-31")))
    tempVaxx$people_fully_vaccinated_per_hundred[
        is.na(tempVaxx$people_fully_vaccinated_per_hundred)] <- 0
    # avoid dividing by zero
    if (max(tempVaxx$people_fully_vaccinated_per_hundred) != 0) {
        tempVaxx$people_fully_vaccinated_per_hundred <- 
            tempVaxx$people_fully_vaccinated_per_hundred / 
            max(tempVaxx$people_fully_vaccinated_per_hundred)
    }

    if (length(tempFlights$date) != length(tempVaxx$date)) {
        #print(sprintf("insufficient data for %s", country))
        next
    }
    
    flightImpact$flightVaxxCorrelation[flightImpact$location == country] <- 
        cor(tempFlights$flightCount, tempVaxx$people_fully_vaccinated_per_hundred)
}
rm(tempFlights, tempVaxx)

map_data("world") %>%
    rename(location = region) %>% 
    left_join(flightImpact, by = "location") %>% 
    ggplot(mapping = aes(x = long, y = lat, group = group,
                         fill = flightVaxxCorrelation)) +
    geom_polygon(color = "black") +
    coord_map(xlim = c(-180, 180), ylim=c(-60, 90)) +
    scale_fill_distiller(palette = "Spectral", direction = 1, limits=c(-1, 1), 
                         breaks=seq(-1,1,by=0.25)) +
    labs(x = element_blank(), y = element_blank()) +
    guides(fill = guide_legend(title = "correlation")) +
    ggtitle("Flight recovery rates vs. Vaccination rates from Feb. 2021 to now")

As one can see, vaccines seem to be much more effective in North America & Europe. This seems probable as these countries have a much better chance at higher vaccination rates & therefore higher airline recovery rates. Japan & South Korea have much more modest numbers compared to the rest of the 1st world. This is likely because they’re industry recovery happened sooner than the vaccination. Australia & New Zealand have negative correlation, which is due to their recent international travel caps set by the governments. We can reasonable conclude that in developed countries with good vaccination rates excluding the aforementioned exceptions, the vaccine seems to improve the recovery of these industries. Below is the same graph, but with the ability to view the numbers of specific countries.

pacman::p_load(plotly)
flightImpact %>% 
    plot_ly(type = "choropleth",
            locations = flightImpact$iso, 
            z = flightImpact$flightVaxxCorrelation,
            text = flightImpact$location,
            colorscale = "agsunset",
            reversescale=TRUE)